
*Import the merged registration data: 

use "$root/Data/Produced/EV_alldata_PID_new_withKM.dta", clear


*Only keep relevant variables:
keep PID marke_typ treibstoffcode average_use year


**Link to the prepared tax data: 

frame create tax
frame change tax

use "$root/Data/Produced/Grunddaten_all.dta", clear

*Keep only the relevant information:
collapse (lastnm) JAHRGANG_DT JAHRGANG_EP ANZ_KAG (mean) TOT_EINK TOTVERM, by(PID)

*Generate the relevant information: 
gen hhsize=1
replace hhsize=2 if JAHRGANG_EP!=.
replace hhsize=3 if JAHRGANG_EP!=.

*Define homeowner:
gen homeowner=(ABZ_HYPO<-1200)

*Define hhsize:
gen married=(JAHRGANG_EP!=.)
gen hhsize=0
replace hhsize=1+married+ANZ_KAG
replace hhsize=round(hhsize)

*recode hhsize to 5+: 
replace hhsize=5 if hhsize>5

**Generate the age_groups: 
gen age_group=. 
replace age=1 if JAHRGANG_DT>1979 & JAHRGANG_DT!=.
replace age=2 if JAHRGANG_DT<=1979 & JAHRGANG_DT!=. & JAHRGANG_DT>1959
replace age=3 if JAHRGANG_DT<=1959 & JAHRGANG_DT!=.

gen age=2019-JAHRGANG_DT if JAHRGANG_DT!=.



*Merge the information from the tax data: 
frame change default

frlink 1:1 PID, frame(tax)

frget TOT_EINK TOTVERM hhsize homeowner age_group age, from(tax)


**Add the relevant information from the building data: 
frame create build
frame change build

use "$root/Data/Produced/build_complete.dta", clear

*Re-collapse the artificially created panel data: 
keep if year==2017 

*bring the relevant information to the data: 
frame change default 

frlink 1:1 PID, frame(build)


**Keep the relevant information: 
frget GKODX GKODY urban3 PVhh, from(build)

drop build


**Generate the two variables distance to EV and charging station density: 

frame create charging
frame change charging
use "$root/Data/Original/EV_charge.dta", clear


su obs
global tot_chargers=`r(max)'

frame change default
frame put PID GKODX GKODY , into(car_coord)

frame change car_coord

rename GKODX X_car
rename GKODY Y_car

gen nvals=_n
su nvals
global max=`r(max)'


forvalues v=1(1) $max {
preserve
qui keep if nvals==`v'
qui expand $tot_chargers, g(obs)
qui bys PID: replace obs=_n
qui frlink 1:1 obs, frame(charging) gen(charge_link)
qui frget GKODX GKODY, from(charge_link)
qui gen distance=.
qui replace distance = sqrt((GKODX-X_car)^2+(GKODY-Y_car)^2) 
qui gen range5=(distance<5000)

qui collapse (min) distance (sum) range5 , by(PID)
tempfile charge_`v'
save `charge_`v''

restore

}

clear
forvalues v=1(1) $max {

append using `charge_`v''

}


frame change default

frlink 1:1 PID, frame(car_coord)
frget range5, from(car_coord)

drop car_coord


*Generate the variable distance to EV which measures the distance to the closest EV owner: 


preserve
gen EV=(drivetype==3)

    * Only keep relevant information to speed up computation:
		keep PID EV GKODX GKODY
	keep if GKODX!=.

	gsort -EV         // sort in descending order
	gen distance=.

	* Loop through all EV observation and calculate distance of that EV to 
	* each observation
	* Replace variable "dist_to_EV" if the resulting distance is smaller than the
	* current value
	gen dist_to_EV = 99999
	count if EV==1
	local numEV = `r(N)'
	forvalues i=1/`numEV' {
		qui replace distance = sqrt((GKODX-GKODX[`i'])^2+(GKODY-GKODY[`i'])^2) 
		qui replace dist_to_EV = distance if distance!=. & distance!=0 & distance<dist_to_EV 
	}
	drop GKODX GKODY distance
	tempfile dist
	save `dist', replace
restore
	


**Now re-merge the temporary file: 
merge 1:1 PID using `dist'
drop _



** Now we standardise the vehicle data to match it to the dataset car_options which was generated:
replace marke_typ=lower(marke_typ)
replace marke_typ=trim(marke_typ)
replace marke_typ=subinstr(marke_typ,"volkswagen","vw",.)
replace marke_typ=subinstr(marke_typ,"wv","vw",.)
replace marke_typ=subinstr(marke_typ,"hunday","hyundai",.)
replace marke_typ=subinstr(marke_typ,"wv","vw",.)
replace marke_typ=subinstr(marke_typ,"mercedes benz","mercedes",.)
replace marke_typ=subinstr(marke_typ,"mercedes-benz","mercedes",.)
replace marke_typ=subinstr(marke_typ,"mercedes-amg","mercedes amg",.)
replace marke_typ=subinstr(marke_typ,"mc laren","mclaren",.)
replace marke_typ=subinstr(marke_typ,"alfa romeo","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,"alfra-romeo","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,"bmw alpina","alpina",.)
replace marke_typ=subinstr(marke_typ,"bmw-alpina","alpina",.)
replace marke_typ=subinstr(marke_typ,"alpine","alpina",.)
replace marke_typ=subinstr(marke_typ,"land rover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"landrover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"rolls royce","rolls-royce",.)
replace marke_typ=subinstr(marke_typ,"aston martin","aston-martin",.)
replace marke_typ=subinstr(marke_typ,"ë","e",.)
replace marke_typ=subinstr(marke_typ,"quattro audi","audi",.)
replace marke_typ=subinstr(marke_typ,"range rover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"alfa-romeo alfa-romeo","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,"alfa-romeo alfa","alfa-romeo",.)
replace marke_typ=subinstr(marke_typ,")","",.)
replace marke_typ=subinstr(marke_typ,"(","",.)
replace marke_typ=subinstr(marke_typ,"qu.","quattro",.)
replace marke_typ=subinstr(marke_typ,"-cng-technik","",.)
replace marke_typ=subinstr(marke_typ,"-dangel","",.)
replace marke_typ=subinstr(marke_typ,"stelvio","stelvio ",.)
replace marke_typ=subinstr(marke_typ,"giulia","giulia ",.)
replace marke_typ=subinstr(marke_typ,"discovery","discovery ",.)
replace marke_typ=subinstr(marke_typ,"discov.","discovery ",.)
replace marke_typ=subinstr(marke_typ,"disco.","discovery ",.)
replace marke_typ=subinstr(marke_typ,"octavia","octavia ",.)
replace marke_typ=subinstr(marke_typ,"hev"," hev",.)
replace marke_typ=subinstr(marke_typ,"spacetourer","spacetourer ",.)
replace marke_typ=subinstr(marke_typ,"rangerover","rangerover ",.)
replace marke_typ=subinstr(marke_typ,"grandc-max","grandc-max ",.)
replace marke_typ=subinstr(marke_typ,"vito","vito ",.)
replace marke_typ=subinstr(marke_typ,"grandcherokee","gr.cherokee ",.)
replace marke_typ=subinstr(marke_typ,"grand cherokee","gr.cherokee ",.)
replace marke_typ=subinstr(marke_typ,"baleno","baleno ",.)
replace marke_typ=subinstr(marke_typ, "grand vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "gr. vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "grd vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "grd.vitara","vitara",.)
replace marke_typ=subinstr(marke_typ, "vitara","vitara ",.)
replace marke_typ=subinstr(marke_typ, "forester","forester ",.)
replace marke_typ=subinstr(marke_typ, "rav4","rav-4",.)
replace marke_typ=subinstr(marke_typ, "rav4","rav-4",.)
replace marke_typ=subinstr(marke_typ, "mokka-x","mokka",.)
replace marke_typ=subinstr(marke_typ, "c hevrolet","chevrolet",.)
replace marke_typ=subinstr(marke_typ, "citroen ds","ds ds",.)
replace marke_typ=subinstr(marke_typ, "jumpy spacetourer","spacetourer",.)
replace marke_typ=subinstr(marke_typ, "jumpyspacet","spacetourer",.)
replace marke_typ=subinstr(marke_typ, "jumper","spacetourer",.)
replace marke_typ=subinstr(marke_typ, "doblo","doblò",.)
replace marke_typ=subinstr(marke_typ, "grand c-max","grc-max",.)
replace marke_typ=subinstr(marke_typ, "grandc-max","grc-max",.)
replace marke_typ=subinstr(marke_typ, "i30n","i30",.)
replace marke_typ=subinstr(marke_typ, "i30w","i30",.)
replace marke_typ=subinstr(marke_typ, "tuscon","tucson",.)
replace marke_typ=subinstr(marke_typ, "q30s","q30",.)
replace marke_typ=subinstr(marke_typ, "fiesta","fiesta ",.)
replace marke_typ=subinstr(marke_typ, "grand santa","santa ",.)
replace marke_typ=subinstr(marke_typ, "grandsanta","santa ",.)
replace marke_typ=subinstr(marke_typ, "rrsport","rr sport ",.)
replace marke_typ=subinstr(marke_typ, "ceed's","ceed",.)
replace marke_typ=subinstr(marke_typ, "b180d","b 180d",.)
replace marke_typ=subinstr(marke_typ, "b220d","b 220d",.)
replace marke_typ=subinstr(marke_typ, "c200","c 200",.)
replace marke_typ=subinstr(marke_typ, "c220","c 220",.)
replace marke_typ=subinstr(marke_typ, "c43","c 43",.)
replace marke_typ=subinstr(marke_typ, "cla200","cla 200",.)
replace marke_typ=subinstr(marke_typ, "cla250","cla 250",.)
replace marke_typ=subinstr(marke_typ, "cla45","cla 45",.)
replace marke_typ=subinstr(marke_typ, "e200d","e 200d",.)
replace marke_typ=subinstr(marke_typ, "e220","e 220",.)
replace marke_typ=subinstr(marke_typ, "g500","g 500",.)
replace marke_typ=subinstr(marke_typ, "gla200","gla 200",.)
replace marke_typ=subinstr(marke_typ, "gla220","gla 220",.)
replace marke_typ=subinstr(marke_typ, "gla250","gla 250",.)
replace marke_typ=subinstr(marke_typ, "mini 3door","mini ",.)
replace marke_typ=subinstr(marke_typ, "mini 5door","mini ",.)
replace marke_typ=subinstr(marke_typ, "mini cabrio","mini ",.)
replace marke_typ=subinstr(marke_typ, "mini jcw","mini ",.)
replace marke_typ=subinstr(marke_typ, "mini john","mini ",.)
replace marke_typ=subinstr(marke_typ, "mitsubishi mitsubishi","mitsubishi ",.)
replace marke_typ=subinstr(marke_typ, "mitsubishi outland.","mitsubishi outlander ",.)
replace marke_typ=subinstr(marke_typ, "qasqai","qashqai ",.)
replace marke_typ=subinstr(marke_typ, "x- trail","x-trail ",.)
replace marke_typ=subinstr(marke_typ, "crosslandx","crossland ",.)
replace marke_typ=subinstr(marke_typ, "x- trail","x-trail ",.)
replace marke_typ=subinstr(marke_typ, "expert traveller","traveller ",.)
replace marke_typ=subinstr(marke_typ, "experttr.","traveller ",.)
replace marke_typ=subinstr(marke_typ, "alhambra","alhambra ",.)
replace marke_typ=subinstr(marke_typ, "cupra ateca","ateca ",.)
replace marke_typ=subinstr(marke_typ, "citigo","citigo ",.)
replace marke_typ=subinstr(marke_typ, "superbc","superb ",.)
replace marke_typ=subinstr(marke_typ, "superd","superb ",.)
replace marke_typ=subinstr(marke_typ, "smart cabrio","smart ",.)
replace marke_typ=subinstr(marke_typ, "smart eq","smart ",.)
replace marke_typ=subinstr(marke_typ, "impreza","impreza ",.)
replace marke_typ=subinstr(marke_typ, "legacy outback","outback ",.)
replace marke_typ=subinstr(marke_typ, "levorg","levorg ",.)
replace marke_typ=subinstr(marke_typ, "subaru wrx","subaru xv ",.)
replace marke_typ=subinstr(marke_typ, "subaru xv","subaru xv ",.)
replace marke_typ=subinstr(marke_typ, "sx4s-cross","sx4 s-cross ",.)
replace marke_typ=subinstr(marke_typ, "audi a 4","audi a4 ",.)
replace marke_typ=subinstr(marke_typ, "vantage v8","v8 vantage ",.)
replace marke_typ=subinstr(marke_typ, "bmw 218","bmw 218d ",.)
replace marke_typ=subinstr(marke_typ, "bmw 225e","bmw 225xe ",.)
replace marke_typ=subinstr(marke_typ, "bmw 318","bmw 318d ",.)
replace marke_typ=subinstr(marke_typ, "bmw 323 i","bmw 323i ",.)
replace marke_typ=subinstr(marke_typ, "bmw 520 ","bmw 520d ",.)
replace marke_typ=subinstr(marke_typ, "fiat 500 x ","fiat 500x ",.)
replace marke_typ=subinstr(marke_typ, "fiat 500c ","fiat 500 ",.)
replace marke_typ=subinstr(marke_typ,"land-rover land-rover","land-rover",.)
replace marke_typ=subinstr(marke_typ,"mazda 3sport","mazda 3 sport",.)
replace marke_typ=subinstr(marke_typ,"mercedes a200","mercedes a 200",.)
replace marke_typ=subinstr(marke_typ,"volvo xc 90","volvo xc90",.)
replace marke_typ=subinstr(marke_typ,"volvo xc 40","volvo xc40",.)
replace marke_typ=subinstr(marke_typ,"mercedes a200","mercedes a 200",.)
replace marke_typ=subinstr(marke_typ,"passatv","passat",.)
replace marke_typ=subinstr(marke_typ,"conti.","continental",.)
replace marke_typ=subinstr(marke_typ,"bmw 218d i","bmw 218i",.)


split marke_typ

rename marke_typ1 MAKName
rename marke_typ2 TYPName1
rename marke_typ3 TYPName2

replace TYPName1=TYPName1 + " " + TYPName2 if MAKName=="tesla"



gen drivetype=.
replace drivetype=1 if treibstoffcode=="B"
replace drivetype=2 if treibstoffcode=="D"
replace drivetype=4 if treibstoffcode=="C" | treibstoffcode=="F"
replace drivetype=3 if treibstoffcode=="E"

label define treibstoffcode 1 "gasoline" 2 "diesel" 3 "electric" 4 "hybrid"
drop treibstoffcode

replace TYPName1=trim(TYPName1)
replace MAKName=trim(MAKName)

rename year year_reg 



merge m:1 MAKName TYPName1 drivetype year_reg using "$root/Data/Produced/car_options.dta"

keep if _merge==3
drop _merge

drop if missing(PRHnewprice)



** Impute the driven kilometres for the households that have no observation: 


gen log_km=log(average_use)

gen income=TOT_EINK/1000
gen wealth=TOTVERM/1000
xtile inc_group=TOT_EINK, nquantiles(4)
xtile wealth_group=TOTVERM, nquantiles(4)

reg log_km i.urban3 dist_to_EV range5 i.PVhh age i.age_group i.hhsize  i.homeowner i.fuel_type car_type_FE* TYPKW i.wealth_group  i.inc_group lIncome


predict km_fit, xb
gen km_predict=exp(km_fit)


su km_predict, d

replace average_use=km_predict if missing(average_use)
replace average_use=km_predict if average_use<4000
replace average_use=km_predict if average_use>30000


rename average_use average_use_var


*Drop all the observations before 2017 since we are only modelling recent decisions: 
keep if year_reg>2016

** See how the data is distributed and which options might not be desirable to keep: 

egen car_combo=group(MAKName TYPName1 drivetype)

egen nb_cars=count(PID), by(car_combo)

egen nb_brand=count(PID), by(MAKName)

egen nb_cars_ann=count(PID), by(car_combo year_reg)


**Drop options with less than 5 registrations per brand and less than 2 registrations per car_combo
drop if nb_brand<5

drop if nb_cars<=2 & drivetype<3



frame put PID average_use_var TOT_EINK TOTVERM GKODX GKODY PVhh homeowner hhsize age age_group range5 dist_to_EV urban3, into(soc_chars)


frame put MAKName TYPName1 drivetype car_size  car_height et_verbrauch et_co2 el_verbrauch env_cat PRHnewprice weight TYPKW car_type_FE* total_reg nb_cars_ann car_combo year_reg, into(car_chars)

frame change car_chars
collapse (firstnm) MAKName TYPName1 drivetype car_size car_height et_verbrauch el_verbrauch env_cat PRHnewprice weight TYPKW car_type_FE* total_reg nb_cars_ann , by(car_combo year_reg) 
save "$root/Data/Produced/sample_car_chars.dta", replace

frame change soc_chars
save "$root/Data/Produced/sample_soc_chars.dta", replace

keep PID average_use_var TOT_EINK TOTVERM GKODX GKODY PVhh homeowner hhsize age age_group range5 dist_to_EV urban3 MAKName TYPName1 drivetype car_size  car_height et_verbrauch et_co2 el_verbrauch env_cat PRHnewprice weight TYPKW car_type_FE* car_combo year_reg

*Save the raw datasample:
save "$root/Data/Produced/EV_sample.dta", replace





